Analyzing social media data with Python¶

Fletcher Heisler¶

In [1]:

from IPython.display import Image
Image(url="http://trackmaven.com/assets/tm_logo_horizontal.png")

Out[1]:

fletcher@trackmaven.com ¶

github.com/fheisler ¶

@fheisler ¶

-------------------------------------------¶

Using the requests library¶

In [2]:

import requests
response = requests.get("http://isitlunchtimeyet.com/", auth=('fheisler', 'passw0rd'))
response

Out[2]:

<Response [200]>

In [3]:

print response.text

<html>
    <head>
        <script type="text/javascript" src="isit.js"></script>
        <link rel="stylesheet" type="text/css" href="s.css" />
        <title>Is it lunch time yet?</title>
    </head>
    <body>
        <div>by <a href="http://www.butterfat.net/">Butterfat, LLC</a></div>
    </body>
</html>

Writing custom API clients¶

In [4]:

import requests


class RateLimitError(Exception):
    pass

class ClientError(Exception):
    pass

class APIClient(object):
    """
    A generic API client to handle interaction using the
    requests module; throw specific errors based on various responses
    """

    # Should be overwritten by implementations, i.e. HTTPError=FacebookHTTPError
    HTTPError = requests.exceptions.HTTPError
    ConnectionError = requests.exceptions.ConnectionError
    Error = ClientError
    RateLimitError = RateLimitError

    def __init__(self):
        """ Throw exceptions if BASE_URI or BASE_PARAMS are not set """
        if not hasattr(self, 'BASE_URI'):
            raise NotImplementedError('Must specify a base uri')
        if not hasattr(self, 'BASE_PARAMS'):
            raise NotImplementedError('Must specify base params')

    def _validate_response(self, response):
        """ Optional response validation """
        pass

    def _validate_error(self, response):
        """
        Optional error validation, for when you want to raise a specific
        exception, for example rate limit exceptions.
        """
        pass

    def _get(self, url, params={}, timeout=10, retries=3):
        """
        Gets a response based on the url and params passed to it; 
        will retry 3 times if there is a connection error
        """
        full_url = '{}/{}'.format(self.BASE_URI, url)
        while retries > 0:
            try:
                if hasattr(self, 'BASE_PARAMS'):
                    params.update(self.BASE_PARAMS)
                response = requests.get(full_url, params=params)
                try:
                    response.raise_for_status()
                    self._validate_response(response)
                    return response
                except requests.exceptions.HTTPError as e:
                    self._validate_error(response)
                    raise self.HTTPError(e)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout) as e:
                retries -= 1
                if not retries:
                    raise self.ConnectionError(e)

    def _json(self, response):
        """ Try to convert the response to JSON """
        try:
            return response.json()
        except Exception as e:
            raise self.Error(e)

Key lessons learned:¶

Include specific errors for logging
Prepare for slow and unreliable responses
Keep it DRY

Analyzing data¶

In [5]:

import pandas as pd
import matplotlib.pyplot as plt

Load in data via a file, database, clipboard...¶

In [6]:

!pwd
!ls

/Users/fheisler/pyohio
PyOhio - Social media data analysis.ipynb instagram.db
blog.csv                                  requirements.txt

In [7]:

import sqlite3

# Import Instagram picture data from SQL table
connection = sqlite3.connect('instagram.db')
instagram_data = pd.io.sql.read_sql("SELECT * FROM instagram;", con=connection)

Summary statistics¶

In [8]:

instagram_data.head()

Out[8]:

	id	account	filter	likes	caption	timestamp
0	31044	murphyoilusa	Valencia	7	Elvis is in the building at Murphy Express 855...	140109 11:17
1	32057	murphyoilusa	Normal	5	Cool tanker! #murphyusa	140129 11:40
2	6359	murphyoilusa	Rise	6	Looking up! #murphyusa	121205 12:11
3	6347	murphyoilusa	Hefe	4	Quick Strike Energy! #murphyusa	121205 12:23
4	6351	murphyoilusa	X-Pro II	4	Look out, it's Super Murph! #murphyusa	121205 12:16

In [9]:

instagram_data.dtypes

Out[9]:

id            int64
account      object
filter       object
likes         int64
comments      int64
caption      object
timestamp    object
dtype: object

In [10]:

instagram_data.describe()

Out[10]:

	id	likes	comments
count	34575.000000	34575.000000	34575.000000
mean	21677.659205	3690.893304	50.993955
std	10841.855251	13330.935345	218.687328
min	3451.000000	0.000000	0.000000
25%	12272.500000	31.000000	0.000000
50%	20929.000000	208.000000	4.000000
75%	31439.500000	1297.000000	21.000000
max	40191.000000	260107.000000	12567.000000

In [11]:

instagram_data['social_actions'] = instagram_data.likes + instagram_data.comments
instagram_data['social_actions'].median()

Out[11]:

215.0

Visualize the data¶

In [12]:

instagram_data.social_actions.hist(bins=50);

In [13]:

instagram_data[instagram_data.social_actions > 50000].social_actions.hist(bins=50)
plt.title("Distribution of social actions (50k+)")
plt.ylabel("Total likes + comments")
plt.xlabel("Number of Instagram pictures");

Explore top lists¶

In [14]:

# Top 10 filters by usage
top_filters = instagram_data.groupby('filter').size().order(ascending=False)
(100 * top_filters / float(sum(top_filters)))[:10]

Out[14]:

filter
Normal      58.522054
Lo-fi        5.512654
Amaro        5.055676
Mayfair      4.937093
Valencia     4.827187
X-Pro II     4.043384
Rise         3.704989
Hudson       3.271150
Hefe         2.111352
Sierra       1.512654
dtype: float64

In [15]:

# Top 10 filters by social engagement
social_actions = instagram_data.groupby('filter').social_actions
(social_actions.sum() / social_actions.count()).order(ascending=False)[:10]

Out[15]:

filter
Normal      5285.500198
Willow      3827.500000
Sierra      2721.579350
Valencia    2259.502097
Amaro       2176.354119
Hudson      1932.223696
Mayfair     1900.403632
1977        1476.029851
Rise        1305.064793
Lo-fi       1174.589192
Name: social_actions, dtype: float64

In [16]:

# Top pics using Willow and Sierra filter
filter_top_filters = instagram_data['filter'].isin(["Willow", "Sierra"])
instagram_data[filter_top_filters].sort('social_actions', ascending=False)[:10]

Out[16]:

	id	account	filter	likes	comments	caption	timestamp	social_actions
10658	31537	nike	Sierra	177037	3242	Finish the season on your terms. #justdoit	140119 22:18	180279
11182	26766	nike	Sierra	106291	567	There's only one receiver who's always open.@u...	131027 18:29	106858
11180	26705	nike	Sierra	103974	370	No gym. No trainer. No excuse. #justdoit@nikew...	131025 19:11	104344
17631	30798	starbucks	Sierra	92508	844	Sometimes you just need to stay in. #Cozy #Cof...	140103 13:08	93352
11175	22369	nike	Sierra	90401	408	Make your next run your best run.The @nikerunn...	130912 12:59	90809
17274	28780	starbucks	Sierra	88863	312	A little piece of home at 37,000 ft.#starbucks...	131127 14:27	89175
11220	7477	nike	Sierra	85967	536	A hist�ria n�o se escreve sozinha.History does...	130625 12:04	86503
17248	24237	starbucks	Sierra	74336	310	Follow your #heart. Celebrate what you #love. ...	130930 14:52	74646
17558	25366	starbucks	Willow	73481	273	Let weather be your excuse to #slowdown and #s...	131017 16:07	73754
17237	29043	starbucks	Willow	67709	226	Perfect cup. #PourOver #Coffee #Love	131204 13:31	67935

In [17]:

# Most engaging Instagram posters
posters = instagram_data.groupby('account').social_actions
top_posters = (posters.sum() / posters.count()).order(ascending=False)
top_posters[:5]

Out[17]:

account
starbucks     47178.250000
nike          41012.822034
disney        20712.481390
footlocker    20349.083185
apple         14354.261905
Name: social_actions, dtype: float64

In [18]:

# Average number of hashtags used by top brands
instagram_data['num_hashtags'] = instagram_data.caption.str.count("#")
tag_counts = instagram_data.groupby('account').num_hashtags
avg_tags = (tag_counts.sum() / tag_counts.count())
pd.concat([top_posters, avg_tags], axis=1).sort('social_actions', ascending=False).num_hashtags[:5]

Out[18]:

starbucks     1.774554
nike          1.241525
disney        0.620347
footlocker    1.899584
apple         0.619048
Name: num_hashtags, dtype: float64

In [19]:

# Plot most effective number of hashtags
tag_effect = instagram_data.groupby('num_hashtags').social_actions
plt.plot((tag_effect.sum() / tag_effect.count())[:5]);
plt.ylabel("Average number of likes + comments");
plt.xlabel("Number of hashtags used");

In [20]:

# Biggest hashtags users
avg_tags.order(ascending=False)[:5]

Out[20]:

account
ross_stores           29.978723
goavisbudget          18.104478
comcast_xfinity       12.000000
gamestop_worldwide     9.115385
costco_wholesale       7.326531
Name: num_hashtags, dtype: float64

In [21]:

# Find the best day of week to post
import arrow

# Create an Arrow timestamp
instagram_data['day'] = instagram_data.timestamp.apply(arrow.get, args=("YYMMDD HH:mm",))

# Format to day of week, 1 through 7
instagram_data.day = instagram_data.day.apply(format, args=("d",))

dow_effect = instagram_data.groupby('day').social_actions
(dow_effect.sum() / dow_effect.count()).plot(kind='bar');
plt.ylabel("Avg likes + comments");
plt.xticks(range(7), ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"));

Text Analysis¶

In [22]:

import nltk

In [23]:

# Import blog data from CSV file
blog_data = pd.read_csv("blog.csv")

In [24]:

blog_data.head()

Out[24]:

	id	title	summary	timestamp	fb_likes	fb_shares	linkedin_shares	pins	tweets
0	6902	15 Things You Probably Didn't Know About 'Good...	<p>“YOU LIKE APPLES?”</p> ...	131121 18:45	2910	1353	1	30	256
1	10924	New Footage Of Baby Lil Bub Is Just As Magical...	<p>Be still my heart.</p> ...	140117 12:30	408	431	0	55	92
2	10922	15 Iconic People You Had No Idea Were The Same...	<p>MIND. BLOWN.</p> ...	140117 10:00	1114	833	4	28	237
3	10921	Activist Gives Speech Inside Mitch McConnell's...	<p>“So I’m asking you the million ...	140117 11:00	1589	590	2	1	202
4	10545	The Definitive 2014 Golden Globes Eyewear Ranking	<p>Matt Damon was “the garbage man who d...	140113 19:30	259	184	7	11	111

In [25]:

# Clean up HTML tags (not quite safe!)
import re

blog_data.summary = blog_data.summary.apply(lambda s: re.sub("<[^<]+?>", "", s))

# Remove multiple spaces
blog_data.summary = blog_data.summary.apply(lambda s: re.sub(' +', ' ', s))

# Remove leading and trailing spaces
blog_data.title = blog_data.title.apply(lambda t: t.strip())
blog_data.summary = blog_data.summary.apply(lambda s: s.strip())

In [26]:

# Decode HTML entities
from lxml import html

blog_data.summary = blog_data.summary.apply(lambda s: html.fromstring(s).text)

In [27]:

# Collapse social shares
blog_data['shares'] = blog_data.fb_likes + blog_data.fb_shares + blog_data.linkedin_shares + blog_data.pins + blog_data.tweets
blog_data = blog_data.drop(['fb_likes', 'fb_shares', 'linkedin_shares', 'pins', 'tweets'], axis=1);

In [28]:

blog_data.shares.describe()

Out[28]:

count      25188.000000
mean        9532.819557
std        51112.025280
min           12.000000
25%          391.000000
50%         1194.000000
75%         4251.000000
max      3349344.000000
dtype: float64

In [29]:

# Top performing blog post
blog_data[blog_data.shares == blog_data.shares.max()]

Out[29]:

	id	title	summary	timestamp	shares
5354	12008	What Career Should You Actually Have?	Do what you love, love what you do.	140130 23:30	3349344

Top words¶

In [30]:

title_word_bag = blog_data.title.apply(lambda t: t + " ").sum()

In [31]:

# Top 10 most common words
from collections import Counter

Counter(title_word_bag.split()).most_common()[:10]

Out[31]:

[('The', 9423),
 ('To', 5390),
 ('Of', 5261),
 ('A', 5070),
 ('You', 4343),
 ('In', 3528),
 ('Is', 3206),
 ('This', 2638),
 ('And', 2555),
 ('That', 2363)]

In [32]:

# Top 10 most common non-stopwords
stopwords = [unicode(word) for word in nltk.corpus.stopwords.words('english')]
title_words = [word for word in title_word_bag.split() if word.lower() not in stopwords]
Counter(title_words).most_common()[:10]

-c:3: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal

Out[32]:

[('Things', 1543),
 ('New', 969),
 ('People', 879),
 ('Reasons', 833),
 ('Make', 810),
 ('21', 736),
 ('Best', 710),
 ('Like', 691),
 ('Know', 654),
 ('Ever', 632)]

Most common bigrams (pairs of words)¶

In [33]:

bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(title_words)

# Filter to only bigrams that appear 20+ times
bigram_finder.apply_freq_filter(20)

bigram_finder.score_ngrams(bigram_measures.raw_freq)[:10]

Out[33]:

[(('World', 'Cup'), 0.0010484454085321765),
 (('Signs', "You're"), 0.0008797300554350446),
 (("Didn't", 'Know'), 0.0008737045071101471),
 (('Daily', 'Links'), 0.000777295733911786),
 (('Look', 'Like'), 0.000777295733911786),
 (('New', 'York'), 0.0007712701855868884),
 (('Definitive', 'Ranking'), 0.0007531935406121957),
 (('Looks', 'Like'), 0.0007170402506628103),
 (('Miley', 'Cyrus'), 0.0006748614123885273),
 (("'Game", "Thrones'"), 0.0006567847674138346)]

Most common bigram collocations¶

In [34]:

# Top 10 bigrams with the highest PMI (pointwise mutual information)
bigram_finder.nbest(bigram_measures.pmi, 10)

Out[34]:

[('Mariah', 'Carey'),
 ("'Let", "Go'"),
 ("'Wrecking", "Ball'"),
 ('Leonardo', 'DiCaprio'),
 ('Zac', 'Efron'),
 ('Amy', 'Poehler'),
 ('Jay', 'Z'),
 ('Fault', "Stars'"),
 ('Los', 'Angeles'),
 ('Fab', 'Drab')]

Regression Analysis¶

Identify features¶

In [35]:

# Examine "top list"-type posts; titles begin with a number
blog_data['list_post'] = blog_data.title.apply(lambda t: t[0].isdigit())

# How many posts are "top list" posts?
float(blog_data.list_post.sum())/blog_data.list_post.count()

Out[35]:

0.37156582499602986

In [36]:

# How effective are list-type posts?
list_effect = blog_data.groupby('list_post').shares
(list_effect.sum() / list_effect.count()).plot(kind='bar');
plt.ylabel("Avg social shares");

In [37]:

# Examine "question"-type posts; title ends with a "?"
blog_data['question_post'] = blog_data.title.apply(lambda t: t[-1] is "?")

# How many posts are "question" posts?
float(blog_data.question_post.sum()) / blog_data.question_post.count()

Out[37]:

0.06229156741305383

In [38]:

# How effective are question posts?
question_effect = blog_data.groupby('question_post').shares
(question_effect.sum() / question_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares");

In [39]:

# Do you want to find out what these amazing questions are?
blog_data[blog_data.question_post].sort('shares', ascending=False).title[:10]

Out[39]:

5354                 What Career Should You Actually Have?
5174                             What Kind Of Dog Are You?
11253                What State Do You Actually Belong In?
4220               Which Decade Do You Actually Belong In?
10968               What City Should You Actually Live In?
12827                     Which Mythical Creature Are You?
9128             QUIZ: What Food Matches Your Personality?
12352    What Actress Would Play You In The Movie Versi...
3641     Which European Country Do You Actually Belong In?
4950           What Should Your College Major Actually Be?
Name: title, dtype: object

In [40]:

blog_data['actually'] = blog_data.title.apply(lambda t: "actually" in t.lower())

# How many posts are "actually" posts?
float(blog_data.actually.sum()) / blog_data.actually.count()

Out[40]:

0.021200571700809908

In [41]:

# But actually, how effective are they?
actual_effect = blog_data.groupby('actually').shares
(actual_effect.sum() / actual_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares");

In [42]:

# Examine post title length
blog_data['title_length'] = blog_data.title.apply(lambda t: len(t))

# Distribution of title lengths
blog_data.title_length.hist();
plt.ylabel("Number of posts");
plt.xlabel("Number of characters in title");

In [43]:

# Most effective title lengths
title_len_effect = blog_data.groupby('title_length').shares
plt.plot((title_len_effect.sum() / title_len_effect.count()));
plt.ylabel("Average shares");
plt.xlabel("Number of characters in title");

In [44]:

# Examine post summary length
blog_data['summary_length'] = blog_data.summary.apply(lambda t: len(t))

# Distribution of summary lengths
blog_data.summary_length.hist(bins=50);
plt.xlabel("Number of characters in summary")
plt.ylabel("Number of posts");

In [45]:

# Highly skewed distribution; save a log-transformed summary length for later
blog_data['summary_log_len'] = blog_data.summary_length.apply(log)
blog_data.summary_log_len.hist();

In [46]:

# Bin summary lengths
bins = range(0, 3000, 100)
blog_data['binned_summary_length'] = pd.cut(blog_data.summary_length, bins=bins, labels=bins[1:])

In [47]:

# Most effective post summary lengths
summary_len_effect = blog_data.groupby('binned_summary_length').shares
plt.plot(bins[1:], summary_len_effect.sum() / summary_len_effect.count());
plt.ylabel("Average shares");
plt.xlabel("Number of characters in summary");

In [48]:

import arrow

# Convert timestamp to arrow object for manipulation
blog_data['timestamp'] = blog_data.timestamp.apply(arrow.get, args=("YYMMDD HH:mm",))

In [49]:

# Day of week distribution
blog_data['dow'] = blog_data.timestamp.apply(lambda ts: int(ts.format('d')))
blog_data.dow.hist(bins=7, range=(0,8));
plt.ylabel("Total number of posts")
plt.xticks(range(8), ("", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"));

In [50]:

# Day of week effectiveness
dow_effect = blog_data.groupby('dow').shares
(dow_effect.sum() / dow_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares")
plt.xticks(range(7), ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"));

In [51]:

# Time of day distribution (hourly)
blog_data['tod'] = blog_data.timestamp.apply(lambda ts: int(ts.format('HH')))
blog_data.tod.hist(bins=24, range=(0,24));
plt.ylabel("Avg social shares");
plt.xlabel("Hour of day (ET)");

In [52]:

# Time of day effectiveness
tod_effect = blog_data.groupby('tod').shares
(tod_effect.sum() / tod_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares");

In [53]:

# Prepare readability scores based on Flesch-Kincaid Grade Level
from re import match

# Load Carnegie Mellon Pronouncing Dictionary
cmu = nltk.corpus.cmudict.dict()

def reduce_word(word):
    return ''.join([x for x in word.lower() if match(r'\w', x)])

def get_syllable_count(word):
    word = reduce_word(word)
    if (not len(word)) or (not word in cmu):
        return 0
    return len([x for x in list(''.join(list(cmu[word])[-1])) if match(r'\d', x)])
 
def get_grade_level(text):
    """Flesch-Kincaid Grade Level formula"""
    sentences = nltk.tokenize.sent_tokenize(text)
    sentence_count = len(sentences)
    word_count = 0
    syllable_count = 0
    for sentence in sentences:
        words = nltk.tokenize.word_tokenize(sentence)
        words = [reduce_word(word) for word in words]
        words = [word for word in words if word != '']
        word_count += len(words)
        syllable_count += sum([get_syllable_count(word) for word in words])
    if word_count is 0:
        return 0
    word_count = float(word_count)
    return (0.39 * (word_count / sentence_count)
            + 11.8 * (syllable_count / word_count)
            - 15.59)

blog_data['grade_level'] = blog_data.summary.apply(get_grade_level)

In [54]:

# Distribution of summary grade-level scores
blog_data.grade_level.hist(bins=30, range=(-10,20));
blog_data.grade_level.describe()

Out[54]:

count    25188.000000
mean         5.098678
std          3.986889
min        -15.200000
25%          2.589286
50%          4.823333
75%          7.430000
max         53.571852
dtype: float64

In [55]:

bins = range(-10, 20, 5)
blog_data['binned_grade_level'] = pd.cut(blog_data.grade_level, bins=bins, labels=bins[1:])

grade_lvl_effect = blog_data.groupby('binned_grade_level').shares
plt.plot(bins[1:], grade_lvl_effect.sum() / grade_lvl_effect.count());
plt.ylabel("Average shares");
plt.xlabel("Flesch-Kinkaid grade level");

In [56]:

# What are these negative scores?
blog_data.sort(['grade_level', 'shares'], ascending=[True, False])[['title', 'summary', 'grade_level', 'shares']][:10]

Out[56]:

	title	summary	grade_level	shares
13977	Hodor?	Hodor?	-15.200	18962
17542	How Misandrist Are You?	#BanMen	-15.200	17415
18336	How Many F#@ks Do You Give?	DYGAF?	-15.200	6309
13461	How Persian Are You?	Vuyyyyyy.	-15.200	5131
21488	Which Celebrity Cat Are You?	meeeeeeeeeoooooow.	-15.200	4000
15144	Should You Get Out Of Bed Today?	Hmmmmmmmmmmmmmmmmmm.	-15.200	3815
6343	How Much Do You Hate Small Talk?	Soooo………	-15.200	2256
349	FYI, Lady Gaga And Christina Aguilera Have Rel...	SLAAAAAY. youtube.com	-15.200	510
4779	Obama's Super Bowl Interview With Bill O'Reill...	JICYMI .	-15.200	192
17768	When Rihanna Met Aaron Paul	#Pinkman4President. Twitter: @rihanna	-15.005	126

(Attempting to) predict post virality...¶

Features:¶

List-type post
Question-type post
"Actually" post
Title length
Summary length
Day of week
Time of day
Summary readability

Data preprocessing¶

In [57]:

# Randomly shuffle rows
blog_data = blog_data.apply(np.random.permutation)

# Use these columns as features
feature_list = ['list_post', 
                'question_post', 
                'actually', 
                'title_length', 
                'summary_log_len', 
                'dow', 
                'tod', 
                'grade_level',
                ]

In [58]:

# Prepare only the columns we need (features + target)
reduced_blog_data = blog_data[feature_list + ['shares']]

In [59]:

# Check for NaN's
reduced_blog_data.isnull().any()

Out[59]:

list_post          False
question_post      False
actually           False
title_length       False
summary_log_len    False
dow                False
tod                False
grade_level        False
shares             False
dtype: bool

In [60]:

from sklearn.preprocessing import StandardScaler

# Normalize the data
scaler = StandardScaler().fit(reduced_blog_data.astype(np.float))
norm_array = scaler.transform(reduced_blog_data.astype(np.float))
norm_blog_data = pd.DataFrame(norm_array, columns=reduced_blog_data.columns)

Train the model¶

In [61]:

# Sample 80% of the data for training; keep 20% for testing
train_prop = int(.8*len(norm_blog_data))
training_set = norm_blog_data[:train_prop]
testing_set = norm_blog_data[train_prop:]

features = training_set[feature_list]
target = training_set.shares

In [62]:

from sklearn import linear_model

# Fit a linear classifier to the training data using stochastic gradient descent
# (probably not a great idea; features are likely to be highly correlated)
clf = linear_model.SGDRegressor()
clf.fit(features, target)

Out[62]:

SGDRegressor(alpha=0.0001, epsilon=0.1, eta0=0.01, fit_intercept=True,
       l1_ratio=0.15, learning_rate='invscaling', loss='squared_loss',
       n_iter=5, penalty='l2', power_t=0.25, random_state=None,
       shuffle=False, verbose=0, warm_start=False)

In [63]:

# Predict results of testing set to measure accuracy
predicted_shares = clf.predict(testing_set[feature_list])

In [64]:

from sklearn.metrics import r2_score

# Measure the accuracy of the predictions
r2_score(testing_set.shares, predicted_shares)

Out[64]:

-0.0032258761644354816

Let's make the problem slightly more feasible...¶

Classify into ranges of virality - not an exact, continuous number of shares!¶

In [65]:

# Bin the number of shares into buckets
bins = [0, 1e3, 1e4, 1e5, blog_data.shares.max()]
blog_data['binned_shares'] = pd.cut(blog_data.shares, bins=bins, labels=bins[1:])

# Check the distribution of binned shares
blog_data.groupby('binned_shares').size()

Out[65]:

binned_shares
1000             11532
10000            10087
100000            3150
3349344            419
dtype: int64

In [66]:

# As before, prepare the data...
reduced_blog_data = blog_data[feature_list] # + ['binned_shares']

scaler = StandardScaler().fit(reduced_blog_data.astype(np.float))
norm_array = scaler.transform(reduced_blog_data.astype(np.float))
norm_blog_data = pd.DataFrame(norm_array, columns=reduced_blog_data.columns)
norm_blog_data.isnull().any()

train_prop = int(.8*len(norm_blog_data))
training_set = norm_blog_data[:train_prop]
testing_set = norm_blog_data[train_prop:]

features = training_set[feature_list]
target = blog_data[:train_prop].binned_shares.astype(str)
true_test_shares = blog_data[train_prop:].binned_shares.astype(str)

In [67]:

# Fit a classifier on the binned shares and predict
clf = linear_model.SGDClassifier()
clf.fit(features, target)
predicted_shares = clf.predict(testing_set)

In [68]:

from sklearn.metrics import accuracy_score

# Measure the accuracy of binned predictions among the 4 categories
accuracy_score(true_test_shares, predicted_shares)

Out[68]:

0.42119888844779674

In [69]:

Image("http://scikit-learn.org/stable/_static/ml_map.png")

Out[69]: